library(data.table)
library(tidyverse)
library(Seurat)
library(patchwork)
library(reactable)
# set your working directory
setwd("~/Documents/HGEN_663/extra/lec8")
Here we will analyze single-cell RNA-seq data from mouse retina, obtained at postnatal day 14. Fresh tissue was dissected, dissociated, and the cell suspension was immediately subjected to the Drop-seq protocol for cell capture and sequencing.
We will start from the Drop-seq output (a matrix of read counts per gene, per cell) of the MacCarrol sample, and perform basic quality control, dimensionality reduction and clustering. We will then analyze the cell populations obtained, infer their cell-type identity and obtain their gene signatures.
Load gene count estimates per cell, and build the basic Seurat object we’ll be using for analysis.
set.seed(42)
DGEmatrix <- fread('Dropseq_p14_retina_Mccarroll.txt', sep = "\t") %>%
data.frame(row.names = 1)
# create our base Seurat object, discarding cells that have less than 100 genes
# and discarding genes that are expressed in less than 3 cells.
Seurat_object <- CreateSeuratObject(DGEmatrix, assay = "RNA",
min.cells = 3, min.features = 100,
project = "Mccarroll_retina")
# number of genes and cells
dim(DGEmatrix)
## [1] 26058 6120
# check DGE matrix
DGEmatrix[1:3, 1:3]
## NORM.P14.WR.Mccarrol_r3_GTACTTAATTTC
## 0610005C13RIK 0
## 0610007N19RIK 0
## 0610007P14RIK 0
## NORM.P14.WR.Mccarrol_r3_CCGGAGGTTAGG
## 0610005C13RIK 0
## 0610007N19RIK 0
## 0610007P14RIK 2
## NORM.P14.WR.Mccarrol_r3_CTGTCCCCCTAT
## 0610005C13RIK 0
## 0610007N19RIK 0
## 0610007P14RIK 1
We will compute some QC metrics that reflect the overall quality of the sample: coverage (# of genes, # of UMIs) and mitochondrial read proportions – which is often an indicator of stress/cell damage
# get a list of mitochondrial genes: all genes starting with 'MT'
mito.genes <- grep("^MT-", rownames(x = Seurat_object@assays$RNA), value = TRUE)
# compute proportions
percent.mito <- Matrix::colSums(Seurat_object@assays$RNA[mito.genes,]) /
Matrix::colSums(Seurat_object@assays$RNA)
# add the information back to the seurat object
Seurat_object <- AddMetaData(object = Seurat_object, metadata = percent.mito,
col.name = "percent.mito")
# get a list of crystal genes (all genes starting by 'CRY') to remove crystalin contamination
crystal.genes <- grep("^CRY[AB]", rownames(x = Seurat_object@assays$RNA), value = TRUE)
# compute proportions
percent.crystal <- Matrix::colSums(Seurat_object@assays$RNA[crystal.genes, ]) /
Matrix::colSums(Seurat_object@assays$RNA)
# add the information back to the seurat object
Seurat_object <- AddMetaData(object = Seurat_object, metadata = percent.crystal,
col.name = "percent.crystal")
# display distribution of some metrics:
# # of genes, # of UMIs, and mitochondrial/crystal proportion
fts <- c("nFeature_RNA", "nCount_RNA", "percent.mito", "percent.crystal")
VlnPlot(object = Seurat_object, ncol = 4, pt.size = 0, features = fts)
# Get some summary stats for the sample:
summary_stats_before_filtering <- tibble(
total_cells = nrow(Seurat_object@meta.data),
mean_n_genes = mean(Seurat_object@meta.data$nFeature_RNA),
sd_n_genes = sd(Seurat_object@meta.data$nFeature_RNA),
max_n_genes = max(Seurat_object@meta.data$nFeature_RNA),
min_n_genes = min(Seurat_object@meta.data$nFeature_RNA),
mean_UMI = mean(Seurat_object@meta.data$nCount_RNA),
sd_UMI = sd(Seurat_object@meta.data$nCount_RNA),
max_UMI = max(Seurat_object@meta.data$nCount_RNA),
min_UMI = min(Seurat_object@meta.data$nCount_RNA)
) %>% mutate_all(function(x) round(x, 2))
summary_stats_before_filtering %>% reactable::reactable()
We saw that the distribution of some metrics showed some low quality cells. Although those are not many cells, they can bother for clustering and cell-type identification. Let’s get rid of the worst offenders, and recompute QC metrics
# We'll define some cutoffs to remove cells that deviate too much from the rest of the
# cells in the sample. These can be hard cut-offs or dataset-specific ones based on SD.
# Sometimes those represent multiplets, or just abnormal cells like crystalin cells
Seurat_object <- subset(Seurat_object, subset = nFeature_RNA > 100 & nFeature_RNA < 6000 &
nCount_RNA < 10000 & percent.mito < 0.10 & percent.crystal < 0.025)
VlnPlot(object = Seurat_object, pt.size = 0, ncol = 4, features = fts)
# Get some summary stats for the sample:
summary_stats_after_filtering <- tibble(
total_cells = nrow(Seurat_object@meta.data),
mean_n_genes = mean(Seurat_object@meta.data$nFeature_RNA),
sd_n_genes = sd(Seurat_object@meta.data$nFeature_RNA),
max_n_genes = max(Seurat_object@meta.data$nFeature_RNA),
min_n_genes = min(Seurat_object@meta.data$nFeature_RNA),
mean_UMI = mean(Seurat_object@meta.data$nCount_RNA),
sd_UMI = sd(Seurat_object@meta.data$nCount_RNA),
max_UMI = max(Seurat_object@meta.data$nCount_RNA),
min_UMI = min(Seurat_object@meta.data$nCount_RNA)
) %>% mutate_all(function(x) round(x, 2))
summary_stats_after_filtering %>% reactable()
print(
paste0("Number of filtered cells: ",
summary_stats_before_filtering$total_cells -
summary_stats_after_filtering$total_cells
)
)
## [1] "Number of filtered cells: 261"
Not much was removed, only a few outlier cells, because this was a quite good quality data to begin with. In other datasets, the before/after QC metrics will look much more different.
Normalize the dataset using SCTransform
# before proceeding, we note markers for cell cycle
Seurat_object <- CellCycleScoring(Seurat_object, set.ident = FALSE,
s.features = cc.genes$s.genes,
g2m.features = cc.genes$g2m.genes)
# we then go on to apply a transforms the data and regresses out unwanted variation
vs <- c("nFeature_RNA", "percent.mito", "S.Score", "G2M.Score")
Seurat_object <- SCTransform(Seurat_object, verbose = T,
vars.to.regress = vs)
FeatureScatter(Seurat_object, feature1 = "nCount_RNA", feature2 = "percent.mito") +
FeatureScatter(Seurat_object, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") &
theme(legend.position = 'none')
We will perform a first dimension reduction by using only the most variable genes. Then, we’ll transform the data using principal component analysis (PCA), and use only the first few components for downstream analyses.
# a bit of wrangling to prepare for VariableFeaturePlot
Seurat_object[['SCT']]@meta.features <- SCTResults(Seurat_object[['SCT']], slot = 'feature.attributes')[, c('gmean', 'variance', 'residual_variance')]
Seurat_object[['SCT']]@meta.features$variable <- F
Seurat_object[['SCT']]@meta.features[VariableFeatures(Seurat_object[['SCT']]), 'variable'] <- F
colnames(Seurat_object[["SCT"]]@meta.features) <- paste0("sct.", colnames(Seurat_object[["SCT"]]@meta.features))
# label top 10 most variable features
VariableFeaturePlot(Seurat_object, selection.method = 'sct', assay = 'SCT') %>%
LabelPoints(points = head(VariableFeatures(Seurat_object), 10), repel = T) &
theme(legend.position = 'none')
# perform PCA
Seurat_object <- RunPCA(Seurat_object, pcs.compute = 100, do.print = F)
# display genes correlated with PCs 1 & 2
VizDimLoadings(Seurat_object, dims = 1:2, reduction = "pca")
# alternative representation of genes highly correlated with PC1
DimHeatmap(Seurat_object, dims = 1:15, cells = 500, balanced = T)
# inspect the standard deviation of each PC
ElbowPlot(Seurat_object)
# assess effect of cell cycle
DimPlot(Seurat_object, reduction = "pca", group.by = "Phase")
Seurat_object <- RunUMAP(Seurat_object, dims = 1:20)
Seurat_object <- RunTSNE(Seurat_object, dims = 1:20)
# cluster the cells based on UMAP reduction
Seurat_object <- FindNeighbors(Seurat_object,
dims = 1:2,
reduction = "umap")
# the resolution can be adjusted to tweak clustering accuracy
Seurat_object <- FindClusters(Seurat_object,
resolution = 0.5,
reduction = "umap")
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5858
## Number of edges: 127383
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9409
## Number of communities: 22
## Elapsed time: 0 seconds
DimPlot(Seurat_object, reduction = "umap", label = T)
DimPlot(Seurat_object, reduction = "tsne", label = T)
VlnPlot(object = Seurat_object, pt.size = 0.1, ncol = 1, features = fts)
We now have some clusters of cells (cell populations), but we still don’t know what these cells are. One can apply differential expression analysis to find marker genes higher expressed in every given cluster as compared to all remaining cells, and then infer the cell type based on current knowledge on those genes.
Seurat_object.markers <- FindAllMarkers(Seurat_object, only.pos = TRUE,
min.pct = 0.25, logfc.threshold = 0.25)
head(Seurat_object.markers) %>% reactable()
We can take a look at the top 10 markers in each cluster
top10 <- Seurat_object.markers %>%
group_by(cluster) %>%
slice_max(avg_log2FC, n = 10) %>%
ungroup()
DotPlot(
Seurat_object,
assay = NULL,
unique(top10$gene),
cols = c("blue", "red"),
col.min = -2.5,
col.max = 2.5,
dot.min = 0,
dot.scale = 6,
group.by = NULL,
split.by = NULL,
scale.by = "radius",
scale.min = NA,
scale.max = NA
) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))
Alternatively, one could directly use known marker genes or published expression profiles of purified / sorted cells.
markers.retina.dotplot <- toupper(c(
"Rho", #Rods
"Opn1mw", #Cones
"Trpm1", #Bipolar cells
"Scgn", #Bipolar cells
"Celf4", #Amacrine cells
"Rgs5", #Pericytes
"Cldn5", #Endothelial cells
"Lyz2", #Immune cells
"Glul", # Muller glia
"Gfap", #Astrocytes
"Optc", #Lens cells
"Lhx1" #Horizontal cells
))
DefaultAssay(Seurat_object) <- 'RNA'
FeaturePlot(Seurat_object, features = 'TRPM1', reduction = 'umap')
DefaultAssay(Seurat_object) <- 'SCT'
DotPlot(
Seurat_object,
assay = NULL,
markers.retina.dotplot,
cols = c("blue", "red"),
col.min = -2.5,
col.max = 2.5,
dot.min = 0,
dot.scale = 6,
group.by = NULL,
split.by = NULL,
scale.by = "radius",
scale.min = NA,
scale.max = NA
) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))
DefaultAssay(Seurat_object) <- 'SCT'
VlnPlot(object = Seurat_object, features = markers.retina.dotplot, pt.size = 0.1, stack=TRUE)
Activity 1: Assign as many cluster labels as you can based on the provided information
DefaultAssay(Seurat_object) <- 'SCT'
Seurat_object <- SetIdent(Seurat_object, cells = NULL, value="seurat_clusters")
new.cluster.ids <- c("", # cluster 0
"", # cluster 1
"", # cluster 2
"", # cluster 3
"", # cluster 4
"", # cluster 5
"", # cluster 6
"", # cluster 7
"", # cluster 8
"", # cluster 9
"", # cluster 10
"", # cluster 11
"", # cluster 12
"", # cluster 13
"", # cluster 14
"", # cluster 15
"", # cluster 16
"", # cluster 17
"", # cluster 18
"", # cluster 19
"" # cluster 20
)
names(new.cluster.ids) <- levels(Seurat_object)
Seurat_object <- RenameIdents(Seurat_object, new.cluster.ids)
Seurat_object[["Cell_Type"]] <- Idents(object = Seurat_object)
DimPlot(Seurat_object, reduction = "umap", label = T)
Now that you know how to process single-cell samples, perform QC, identify cell populations and assign identities, you are ready to analyze a dataset on your own! We have another mouse retinal sample from the Joyal Lab.
set.seed(42)
DGEmatrix <- fread('Dropseq_p14_retina_Mccarroll.txt', sep = "\t") %>%
data.frame(row.names = 1)